Gender bias in speakers and career position
Data
EcoEncontros Seminar talks
Talks from the EcoEncontros Seminar series of the Graduate Program in Ecology at the University of São Paulo (PPGE-USP), Brazil.
See the file metadata.txt, in the data folder, for a more detailed description of the dataset.
# Load the seminar talks table. as.is = TRUE keeps text columns as character
# instead of converting them to factors. TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
data <- read.table(
  "data/presentations_PPGE_2008-2019.csv",
  sep = ",", header = TRUE, as.is = TRUE
)
# Dates are parsed as day-month-year; the talk year is derived from the parsed
# date.
data$date <- dmy(data$date)
data$year <- year(data$date)
#skimr::skim(data)Excluding special events, such as round tables and discussions, that are not related to a project or study presented by someone.
IDs <- c(154, 250, 211, 289)
data <- data %>% filter(!id %in% IDs)For this specific analysis, we exclude speakers who are not in academia ("others") and keep undergraduate, MD and PhD students in the group student, together with postdoc, professor and researcher*.
*Researchers are included in the professor categorical position (column position_cat) because all of them come from research institutions.
data <- data %>% filter(position_cat != "others")
data$position_cat <- fct_relevel(data$position_cat, "student",
"postdoc","professor")Creating dummy column to indicate if the speaker is a female (1) or not (0)
# Dummy response for the binomial models: 1 = female speaker, 0 = male speaker.
# The value is looked up from the gender code, so a missing or unexpected code
# becomes NA instead of being silently counted as female (the previous version
# set every row to 1 and only overwrote the "M" rows).
data$fem <- unname(c(F = 1, M = 0)[as.character(data$gender)])
Population data from PPGE-USP from 2008-2019
Number of students, postdocs and professors in the PPGE-USP per gender and year.
# Load the PPGE-USP population table (counts per year, position and gender).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
pop <- read.table(
  "data/pop_PPGE_2008-2019.csv",
  sep = ",", header = TRUE, as.is = TRUE
)
kable(pop)| year | student_F | student_M | professor_F | professor_M | postdoc_F | postdoc_M | total_F | total_M |
|---|---|---|---|---|---|---|---|---|
| 2008 | 51 | 24 | 4 | 3 | 2 | 1 | 57 | 28 |
| 2009 | 53 | 23 | 1 | 3 | 4 | 3 | 58 | 29 |
| 2010 | 51 | 24 | 1 | 4 | 4 | 3 | 56 | 31 |
| 2011 | 54 | 31 | 1 | 4 | 7 | 3 | 62 | 38 |
| 2012 | 52 | 31 | 2 | 7 | 8 | 4 | 62 | 42 |
| 2013 | 57 | 39 | 5 | 7 | 8 | 9 | 70 | 55 |
| 2014 | 56 | 39 | 6 | 6 | 5 | 12 | 67 | 57 |
| 2015 | 52 | 39 | 8 | 8 | 3 | 15 | 63 | 62 |
| 2016 | 51 | 38 | 9 | 8 | 4 | 14 | 64 | 60 |
| 2017 | 43 | 36 | 9 | 9 | 7 | 14 | 59 | 59 |
| 2018 | 42 | 33 | 8 | 9 | 10 | 12 | 60 | 54 |
| 2019 | 41 | 37 | 2 | 6 | 10 | 7 | 53 | 50 |
Data description
dim(data)## [1] 330 31
Speakers data
Proportion and number of male and female speakers per academic position.
data %>% tabyl(position_cat, gender) %>% adorn_percentages("row") %>%
adorn_pct_formatting(digits = 0) %>%
adorn_ns() %>%
kable(caption="Proportion and number (in parenthesis) of females (F) and males (M) per academic position category.")| position_cat | F | M |
|---|---|---|
| student | 53% (91) | 47% (82) |
| postdoc | 44% (25) | 56% (32) |
| professor | 25% (25) | 75% (75) |
# Number of speakers per academic position (rows) and gender (columns F and M).
n.y <- data %>% tabyl(position_cat, gender)
# Percentage of female speakers per position, computed from the counts so the
# bar annotations cannot drift from the data. The former hard-coded labels
# ("52%", "43%", "24%") disagreed with the tabulated 53%, 44% and 25%.
pct_female <- paste0(round(100 * n.y$F / (n.y$F + n.y$M)), "%")
# Stacked bar chart of talks per position, filled by gender. The colour scale
# that used to be added here was dropped: no layer maps the colour aesthetic,
# so it had no effect on the figure.
ggplot(data, aes(x = position_cat, fill = gender)) +
  geom_bar() +
  ylab("Number of speakers") +
  xlab("") +
  scale_fill_manual("gender", values = c("#b2abd2", "#fdb863")) +
  theme(
    text = element_text(size = 18),
    axis.text.x = element_text(size = 16)
  ) +
  # Label height is M + F/2, i.e. the middle of the female segment provided it
  # is stacked above the male one.
  annotate("text", x = seq_len(nrow(n.y)), y = n.y$M + n.y$F / 2, size = 5,
           label = pct_female)
ggsave("figures/numberSpeakers_position.jpeg", units="in", width=7, height=4.5, dpi=300)
Variation in time.
Origin of the speakers
There were 143 (43%) talks given by people from the PPGE population.
Including talks from the Institute of Biosciences, USP, there were 180 (55%).
PPGE-USP population data
PPGE-USP population size by gender in time
PPGE-USP population size by gender and position and year
# Stacked bars of PPGE-USP population size per year, one facet per position
# category, annotated with the percentage of females (propFcat).
# ytext is the height at which each percentage label is drawn (M + F/2).
pop2 %>% mutate(ytext = M + F/2) %>%
# NOTE(review): columns 7:8 are picked by position and pop2 is built outside
# this excerpt; presumably they are the F and M count columns -- confirm.
pivot_longer(7:8, names_to = "gender", values_to = "N") %>%
ggplot(aes(x=as.factor(year),y=N, fill=gender)) + geom_col() +
# Free scales: each position category gets its own y range.
facet_wrap(~category, scales="free",ncol=1)+
ylab("N") + xlab("") +
scale_fill_manual("gender", values = c("#b2abd2", "#fdb863")) +
theme(text = element_text(size=18),
axis.text.x = element_text(size=16, angle=45, hjust=1)) +
# The label is drawn once per long-format row, i.e. once per gender, at the
# same position and with the same text.
geom_text(aes(x=as.factor(year), y=ytext,
label=paste0(round(propFcat*100), "%") ))ggsave("figures/popSize_positionYear.jpeg", units="in", width=7, height=14, dpi=300)Comparing proportions of female speakers in the seminar and in the population by position.
# Female share among speakers for every year x position combination.
# count() receives the grouping columns directly, so the result is a plain
# (ungrouped) table; group_by() + count() left propS grouped for every later
# step.
propS <- data %>%
  count(year, position_cat, gender) %>%
  # One row per year x position, with the F and M counts side by side; a gender
  # with no talks in a combination is filled with 0.
  pivot_wider(names_from = gender, values_from = n, values_fill = 0) %>%
  mutate(propFspeaker = F / (M + F))

# Population proportion of females (propFcat) side by side with the speaker
# proportion (propFspeaker), matched by year and position. Rows of pop2 with
# no matching talks keep NA in the speaker columns (left join).
proportions <- pop2 %>%
  dplyr::select(year, category, propFcat, F, M) %>%
  rename(position_cat = category) %>%
  left_join(propS, by = c("year", "position_cat")) %>%
  mutate(
    position_cat = fct_relevel(position_cat, "student", "postdoc", "professor")
  )
ggplot(proportions,aes(x=propFcat, y=propFspeaker, col=year)) +
scale_color_gradient()+
geom_point() +
facet_wrap(~position_cat)+
xlim(0,1) +
xlab("Proportion of females in the population") +
ylab("Proportion of females as speakers")+
geom_abline(slope=1,intercept=0, linetype="dashed") +
geom_hline(yintercept = 0.5, linetype="dashed", col="lightgray")+
geom_vline(xintercept = 0.5, linetype="dashed", col="lightgray")Relationship between the proportion of females in the PPGE population in each category of academic position in each year (x axis) and the proportion of female speakers in the same category and year (y axis). Horizontal and vertical gray dashed lines mark 50% for each proportion, and the diagonal black dashed line indicates where the two proportions are equal.
ggsave("figures/propFemale_popXspeaker.jpeg", units="in", width=14, height=5, dpi=300)By year
# Female proportion through time in the PPGE population (propFcat) and among
# speakers (propFspeaker), one panel per academic position.
# The two columns are selected by name: the former positional c(3, 8) would
# silently pick other columns if `proportions` gained, lost or reordered one.
proportions %>%
  pivot_longer(c(propFcat, propFspeaker),
               names_to = "data", values_to = "proportion") %>%
  ggplot(aes(x = year, y = proportion, col = data)) +
  geom_point() +
  # breaks pins each colour and legend label to its series instead of relying
  # on the alphabetical order of the column names.
  scale_color_manual(name = "Dataset", values = 1:2,
                     breaks = c("propFcat", "propFspeaker"),
                     labels = c("PPGE population", "Speakers pop")) +
  # Linear trend per series and panel.
  geom_smooth(method = "lm") +
  facet_wrap(~position_cat, ncol = 1) +
  ylab("Proportion of females") +
  geom_hline(yintercept = 0.5, linetype = "dashed", col = "lightgray")
ggsave("figures/propFemale_popXspeaker_byYear.jpeg", units="in", width=7, height=10, dpi=300)
Modeling
Proportions of female speakers by academic position and time.
Binomial models with the response variable coded as 0 if the speaker is male or 1 if female. Predictor variables are year and academic position.
OBS: Starting in 2018, the EcoEncontros students' committee actively tried to balance gender in presentations as an affirmative policy in the group. Because of that, we also analyzed whether the proportions differed between the periods before and after the policy.
data$affirm_action <- ifelse(data$year<2018,"before", "after")
data$affirm_action <- fct_relevel(data$affirm_action,"before", "after")There were 256 seminars before and 74 after the affirmative actions.
# Candidate set of binomial GLMs for the probability that a speaker is female
# (fem = 1). All models are fitted to the same data and compared by AIC below.
# Null model: intercept only.
mod0 <- glm(fem ~ 1, family=binomial, data= data)
# Time as a continuous trend in year.
mod1 <- glm(fem ~ year, family=binomial, data= data)
# Time as a two-level factor: before vs. after the affirmative action.
mod2 <- glm(fem ~ affirm_action, family=binomial, data= data)
# Academic position only.
mod3 <- glm(fem ~ position_cat, family=binomial, data= data)
# Additive effects of position and year.
mod4 <- glm(fem ~ position_cat + year, family=binomial, data= data)
# Position-specific year trends (main effects plus interaction).
mod5 <- glm(fem ~ position_cat*year, family=binomial, data= data)
# Additive effects of position and affirmative-action period.
mod6 <- glm(fem ~ position_cat + affirm_action, family=binomial, data= data)
# Position-specific effect of the affirmative action (main effects plus
# interaction).
mod7 <- glm(fem ~ position_cat*affirm_action, family=binomial, data= data)
kable(AICtab(mod0,mod1,mod2,mod3,mod4,mod5,mod6,mod7, base=T, weights=T), digits=2)| AIC | dAIC | df | weight | |
|---|---|---|---|---|
| mod7 | 434.13 | 0.00 | 6 | 0.42 |
| mod5 | 434.98 | 0.84 | 6 | 0.28 |
| mod3 | 435.98 | 1.85 | 3 | 0.17 |
| mod6 | 437.56 | 3.43 | 4 | 0.08 |
| mod4 | 437.98 | 3.85 | 4 | 0.06 |
| mod0 | 452.47 | 18.34 | 1 | 0.00 |
| mod2 | 453.11 | 18.98 | 2 | 0.00 |
| mod1 | 454.42 | 20.28 | 2 | 0.00 |
#as.data.frame(AICtab(mod0,mod1,mod2,mod3,mod4,mod5,mod6,mod7, base=T, weights=T)) %>%
# mutate_at(c(1,2,4), round, digits=2) %>% kable()The best model is the one with the interaction between academic position and affirmative actions (as time event), but the model with the interaction between academic position and year (mod5) and the model with only academic position (mod3) are equally plausible (dAIC < 2).
Residual diagnostic of the selected models
All equally plausible models presented satisfactory residual diagnostics.
hnp(mod7)## Binomial model
hnp(mod5)## Binomial model
hnp(mod3)## Binomial model
plot(simulateResiduals(mod7))plot(simulateResiduals(mod5))plot(simulateResiduals(mod3))Models results
summary(mod7)##
## Call:
## glm(formula = fem ~ position_cat * affirm_action, family = binomial,
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5518 -1.1073 -0.6945 1.0901 1.7552
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.2088 0.1799 1.161 0.2458
## position_catpostdoc -0.6857 0.3498 -1.960 0.0500 *
## position_catprofessor -1.5080 0.3210 -4.698 2.63e-06 ***
## affirm_actionafter -0.3758 0.3410 -1.102 0.2704
## position_catpostdoc:affirm_actionafter 1.7000 0.8261 2.058 0.0396 *
## position_catprofessor:affirm_actionafter 1.4238 0.6640 2.144 0.0320 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.47 on 329 degrees of freedom
## Residual deviance: 422.13 on 324 degrees of freedom
## AIC: 434.13
##
## Number of Fisher Scoring iterations: 4
performance::r2(mod7)## # R2 for Logistic Regression
## Tjur's R2: 0.082
my7 <- ggpredict(mod7, terms=c("position_cat","affirm_action"))
plot(my7) +
geom_hline(yintercept = 0.5, linetype="dashed")Figure: proportion of female speakers by position_cat and affirm_action
# Observed number of talks per position, period and gender (fem = 0/1), and
# the mod7 predictions with their confidence limits taken from my7.
suma <- data %>% count(position_cat, affirm_action, fem)
prs <- as.data.frame(my7)
ggplot(suma, aes(x = position_cat, y = fem, col = affirm_action)) +
  # Raw data: translucent points at y = 0 and y = 1, sized by the number of
  # talks. The former `show_guides=F` is not a geom_point() argument (ggplot2
  # warns and ignores it), so it was removed and the legend is drawn exactly as
  # before; pass show.legend = FALSE here if this layer should be hidden from
  # the legend. The stray empty first argument of aes() was removed as well.
  geom_point(aes(size = n), position = position_dodge(0.6), alpha = 0.2) +
  scale_size(range = c(1, 10), breaks = c(3, 10, 20, 60)) +
  # Model predictions with confidence limits, dodged to line up with the raw
  # points of the same period.
  geom_pointrange(
    data = prs,
    aes(x = x, y = predicted, col = group,
        ymax = conf.high, ymin = conf.low),
    position = position_dodge(0.6)
  ) +
  # Reference line at gender parity.
  geom_hline(yintercept = 0.5, linetype = "dashed", col = 'gray') +
  scale_color_manual(name = "Affirmative \n actions",
                     values = c("goldenrod", "green4")) +
  ylab("Proportion of female speakers") +
  xlab("Academic position") +
  theme(
    text = element_text(size = 20),
    axis.text = element_text(size = 18)
  )
ggsave("figures/prop_female_speakers.jpeg", width=9, height = 6)
summary(mod5)##
## Call:
## glm(formula = fem ~ position_cat * year, family = binomial, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4358 -1.0948 -0.7194 1.1282 1.8046
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 126.68013 86.87568 1.458 0.1448
## position_catpostdoc -482.23276 199.27943 -2.420 0.0155 *
## position_catprofessor -255.41782 187.46781 -1.362 0.1731
## year -0.06284 0.04313 -1.457 0.1451
## position_catpostdoc:year 0.23924 0.09893 2.418 0.0156 *
## position_catprofessor:year 0.12622 0.09308 1.356 0.1751
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.47 on 329 degrees of freedom
## Residual deviance: 422.98 on 324 degrees of freedom
## AIC: 434.98
##
## Number of Fisher Scoring iterations: 4
performance::r2(mod5)## # R2 for Logistic Regression
## Tjur's R2: 0.081
my5 <- ggpredict(mod5, terms=c("year", "position_cat"))
plot(my5) +
geom_hline(yintercept = 0.5, linetype="dashed")summary(mod3)##
## Call:
## glm(formula = fem ~ position_cat, family = binomial, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2219 -1.0745 -0.7585 1.1335 1.6651
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.1041 0.1523 0.684 0.494
## position_catpostdoc -0.3510 0.3073 -1.142 0.253
## position_catprofessor -1.2028 0.2766 -4.348 1.37e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.47 on 329 degrees of freedom
## Residual deviance: 429.98 on 327 degrees of freedom
## AIC: 435.98
##
## Number of Fisher Scoring iterations: 4
performance::r2(mod3)## # R2 for Logistic Regression
## Tjur's R2: 0.060
my3 <- ggpredict(mod3, terms=c("position_cat"))
plot(my3) +
geom_hline(yintercept = 0.5, linetype="dashed")Proportions of female speakers by academic position and time - controlling for population proportions
The models below control for differences in gender ratios in the PPGE population by academic position. For that, we included in every model of the model set a control variable with the ratio of females in each year and position.
# Attach to every talk the female ratio of the matching PPGE category and year.
# NOTE(review): pop2 is built outside this excerpt and its columns are picked
# by position; column 10 is presumably ratioFcat (the covariate used by the
# models below) -- confirm.
data <- data %>%
  left_join(pop2[, c(1:2, 10)],
            by = c("year", "position_cat" = "category")) %>%
  # The join leaves position_cat as plain text (the modc* summaries use
  # "postdoc", the alphabetical first value, as baseline). Restoring the factor
  # order keeps "student" as the reference level, as in mod0-mod7, so the
  # coefficients of both model sets are directly comparable. Model fit (AIC,
  # deviance) does not depend on the reference level.
  mutate(
    position_cat = fct_relevel(position_cat, "student", "postdoc", "professor")
  )

# Same candidate set as mod0-mod7, each with ratioFcat added as a control
# covariate.
modc0 <- glm(fem ~ 1 + ratioFcat, family = binomial, data = data)
modc1 <- glm(fem ~ year + ratioFcat, family = binomial, data = data)
modc2 <- glm(fem ~ affirm_action + ratioFcat, family = binomial, data = data)
modc3 <- glm(fem ~ position_cat + ratioFcat, family = binomial, data = data)
modc4 <- glm(fem ~ position_cat + year + ratioFcat, family = binomial,
             data = data)
modc5 <- glm(fem ~ position_cat * year + ratioFcat, family = binomial,
             data = data)
modc6 <- glm(fem ~ position_cat + affirm_action + ratioFcat,
             family = binomial, data = data)
modc7 <- glm(fem ~ position_cat * affirm_action + ratioFcat,
             family = binomial, data = data)
AICtab(modc0,modc1,modc2,modc3,modc4,modc5,modc6,modc7, base=T, weights=T)## AIC dAIC df weight
## modc7 436.1 0.0 7 0.364
## modc5 436.9 0.7 7 0.252
## modc3 437.6 1.5 4 0.173
## modc6 438.9 2.7 5 0.093
## modc4 439.4 3.3 5 0.070
## modc1 442.2 6.1 3 0.018
## modc2 442.2 6.1 3 0.018
## modc0 442.8 6.7 2 0.013
Similar results as without the ratio of females in the PPGE population
Residual diagnostic of the selected models
All equally plausible models presented satisfactory residual diagnostics.
par(mfrow=c(2,2))
hnp(modc7)## Binomial model
hnp(modc5)## Binomial model
hnp(modc3)## Binomial model
plot(simulateResiduals(modc7))plot(simulateResiduals(modc5))plot(simulateResiduals(modc3)) Comparing both models - controlling and not controlling
AICtab(mod0,mod1,mod2,mod3,mod4,mod5,mod6,mod7,
modc0,modc1,modc2,modc3,modc4,modc5,modc6,modc7, base=T, weights=T)## AIC dAIC df weight
## mod7 434.1 0.0 6 0.2948
## mod5 435.0 0.8 6 0.1933
## mod3 436.0 1.8 3 0.1169
## modc7 436.1 2.0 7 0.1088
## modc5 436.9 2.7 7 0.0752
## mod6 437.6 3.4 4 0.0531
## modc3 437.6 3.5 4 0.0517
## mod4 438.0 3.8 4 0.0430
## modc6 438.9 4.7 5 0.0278
## modc4 439.4 5.3 5 0.0208
## modc1 442.2 8.0 3 0.0053
## modc2 442.2 8.1 3 0.0052
## modc0 442.8 8.7 2 0.0038
## mod0 452.5 18.3 1 <0.001
## mod2 453.1 19.0 2 <0.001
## mod1 454.4 20.3 2 <0.001
Models results
Using 1:1 population gender ratio
summary(modc7)##
## Call:
## glm(formula = fem ~ position_cat * affirm_action + ratioFcat,
## family = binomial, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5551 -1.1067 -0.6921 1.0936 1.7596
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.5000 0.4062 -1.231 0.2184
## position_catprofessor -0.8154 0.4094 -1.992 0.0464 *
## position_catstudent 0.6668 0.4150 1.607 0.1081
## affirm_actionafter 1.3190 0.7550 1.747 0.0806 .
## ratioFcat 0.0250 0.2966 0.084 0.9328
## position_catprofessor:affirm_actionafter -0.2694 0.9474 -0.284 0.7762
## position_catstudent:affirm_actionafter -1.6822 0.8525 -1.973 0.0485 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.47 on 329 degrees of freedom
## Residual deviance: 422.13 on 323 degrees of freedom
## AIC: 436.13
##
## Number of Fisher Scoring iterations: 4
performance::r2(modc7)## # R2 for Logistic Regression
## Tjur's R2: 0.082
myc7 <- ggpredict(modc7, terms=c("position_cat","affirm_action",
"ratioFcat[1]"))
plot(myc7) +
geom_hline(yintercept = 0.5, linetype="dashed")summary(modc5)##
## Call:
## glm(formula = fem ~ position_cat * year + ratioFcat, family = binomial,
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4938 -1.0979 -0.7081 1.1347 1.8100
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -381.95004 196.57186 -1.943 0.0520 .
## position_catprofessor 264.94673 270.94859 0.978 0.3281
## position_catstudent 481.68398 199.33201 2.416 0.0157 *
## year 0.18944 0.09751 1.943 0.0520 .
## ratioFcat 0.13239 0.39651 0.334 0.7385
## position_catprofessor:year -0.13193 0.13450 -0.981 0.3266
## position_catstudent:year -0.23900 0.09896 -2.415 0.0157 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.47 on 329 degrees of freedom
## Residual deviance: 422.87 on 323 degrees of freedom
## AIC: 436.87
##
## Number of Fisher Scoring iterations: 4
performance::r2(modc5)## # R2 for Logistic Regression
## Tjur's R2: 0.081
myc5 <- ggpredict(modc5, terms=c("year", "position_cat",
"ratioFcat[1]"))
plot(myc5) +
geom_hline(yintercept = 0.5, linetype="dashed")summary(modc3)##
## Call:
## glm(formula = fem ~ position_cat + ratioFcat, family = binomial,
## data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2758 -1.1441 -0.7425 1.1473 1.6932
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.4022 0.3711 -1.084 0.278
## position_catprofessor -0.7993 0.3637 -2.198 0.028 *
## position_catstudent 0.2581 0.3433 0.752 0.452
## ratioFcat 0.1616 0.2669 0.605 0.545
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 450.47 on 329 degrees of freedom
## Residual deviance: 429.62 on 326 degrees of freedom
## AIC: 437.62
##
## Number of Fisher Scoring iterations: 4
performance::r2(modc3)## # R2 for Logistic Regression
## Tjur's R2: 0.061
myc3 <- ggpredict(modc3, terms=c("position_cat", "ratioFcat[1]"))
plot(myc3) +
geom_hline(yintercept = 0.5, linetype="dashed")